---
title: "Construction of the Analysis Dataset"
author: "Prof. Andrew Eggers"
format: html
---

# Overview 

This document takes the raw data from Qualtrics and CloudResearch and constructs the "analysis dataset", i.e. the dataset that team members should use in writing up the Methods and Results sections of the paper.

It implements the rules for recoding variables and excluding observations that were described in the pre-analysis plan.

I have used **boldface** below to highlight new information that I added. The text is otherwise from the document submitted with the PAP.


# Data management and cleaning 

## Loading the Qualtrics data and initial cleaning

```{r}
#| message: false
#| output: false
library(tidyverse)
knitr::opts_chunk$set(dev = "ragg_png")
```

Here we load the raw data from Qualtrics: 

```{r}
#| message: false
#| output: false

qualtrics_filename <- "data/SSI3+2024+SRM+Version+4_April+30,+2024_20.52.csv"
# first we just extract the column names -- because there are multiple rows of columns names, this version of the dataset does not have the correct data types (everything is a character) 
headers <- read_csv(qualtrics_filename) |> colnames()
# then we load a version that skips the first two columns: correct data types, incorrect column names
dat <- read_csv(qualtrics_filename, skip = 2)
# and then we apply correct column names
names(dat) <- headers
```

Next we reformat some of the columns to be ordered factors:

```{r}
great_deal_levels <- c("Not at all", "A little", "Somewhat", "A lot", "A great deal")
agreement_levels <- c("Strongly disagree", "Somewhat disagree", "Neither agree nor disagree", "Somewhat agree", "Strongly agree")
worry_levels <- c("Not at all worried", "A little worried", "Moderately worried", "Very worried", "Extremely worried")

dat |> 
  # exclude any observations collected in preview mode
  filter(DistributionChannel != "preview") |> 
  # make ordered factors
  mutate(
         ### pre-treatment attitudes
         cc_concern_pre = factor(cc_concern_pre, levels = c("Moderately concerned", "Not concerned at all", "Slightly concerned", "Very concerned", "Extremely concerned")), # putting 'Moderately concerned' first means this is the omitted category in regressions
         self_efficacy_pre = factor(self_efficacy_pre, levels = c("Neither agree nor disagree", "Strongly disagree", "Somewhat disagree", "Somewhat agree", "Strongly agree")), # same for "Neither agree nor disagree" 
         # post-treatment responses
         tech_threat_post = factor(tech_threat_post, levels = agreement_levels, ordered = T),
         mech_belief_post = factor(mech_belief_post, levels = agreement_levels, ordered = T),
         solve_wo_reduce_post = factor(solve_wo_reduce_post, levels = agreement_levels, ordered = T),
         cc_affect_self_post = factor(cc_affect_self_post, levels = great_deal_levels, ordered = T),
         ccaffect_others_post = factor(ccaffect_others_post, levels = great_deal_levels, ordered = T),
         response_treaty = factor(response_treaty, levels = agreement_levels, ordered = T),
         response_car_eff = factor(response_car_eff, levels = agreement_levels, ordered = T),
         response_renewables = factor(response_renewables, levels = agreement_levels, ordered = T),
         response_carbon_tax = factor(response_carbon_tax, levels = agreement_levels, ordered = T),
         response_nuclear = factor(response_nuclear, levels = agreement_levels, ordered = T),
         response_ind_use = factor(response_ind_use, levels = agreement_levels, ordered = T),
         response_elec_emit = factor(response_elec_emit, levels = agreement_levels, ordered = T),
         response_clean_nrg = factor(response_clean_nrg, levels = agreement_levels, ordered = T),
         # treatment categories
         SAI_either = as.integer(Condition %in% c("SAI", "Both")),
         Emissions_either = as.integer(Condition %in% c("Emissions", "Both"))
         ) -> dat2
```

Next we prepare our post-treatment outcomes. We create an index of support for emission mitigation policies and an index of worry about climate change, and we standardize each of our measures: 

```{r}
dat2 |> 
  mutate(outcome_index = as.integer(response_treaty) +
           as.integer(response_car_eff) + 
           as.integer(response_renewables) + 
           as.integer(response_carbon_tax) + 
           as.integer(response_nuclear) + 
           as.integer(response_ind_use) + 
           as.integer(response_elec_emit) + 
           as.integer(response_clean_nrg),
         worry_index = as.integer(cc_affect_self_post) + as.integer(ccaffect_others_post),
         # note: the scale function standardizes a variable -- 
         # converts it to have mean of 0 and standard deviation of 1
         standardized_policy_index = scale(outcome_index),
         standardized_worry_index = scale(worry_index),
         standardized_mech_belief = scale(as.integer(mech_belief_post)),
         standardized_tech_threat = scale(as.integer(tech_threat_post)),
         standardized_techno_fix = scale(as.integer(solve_wo_reduce_post)),
         Condition = factor(Condition, levels = c("Neither", "Emissions", "SAI", "Both"))) -> dat3
```

And we order and categorize some of our covariates for ease of use: 

```{r}
dat3 |> 
  mutate(Affiliation = factor(Affiliation, levels = c("Moderate", "Very conservative", "Conservative", "Not sure", "Liberal", "Very liberal")), # again putting factor out of order so that middle category is omitted group in regression s
         Affiliation_3cats = case_when(Affiliation %in% c("Very conservative", "Conservative") ~ "Conservative",
                                       Affiliation %in% c("Liberal", "Very liberal") ~ "Liberal",
                                       Affiliation == "Moderate" ~ "Moderate",
                                       TRUE ~ NA_character_),
         Affiliation_3cats = factor(Affiliation_3cats, levels = c("Liberal", "Moderate", "Conservative")),
         attribution_quintile = factor(ntile(causal_attribution_1, n = 5), levels = c(3, 1, 2, 4, 5))) -> dat4
         
```



## Loading and combining with the CloudResearch covariates 

Load in data about the respondents, from CloudResearch: 

```{r}
#| message: false
#| output: false

cloudresearch_filename <- "data/assignments_3c2b52ff-7586-4809-874c-dd4df3f210e8.csv"
covariates <- read_csv(cloudresearch_filename) |> select(ParticipantId, Age, Education, Sex, `Occupation Field`, `Relationship/Marital Status`, `Political Party`, `Gender`, `Country Of Residence`, `Household Income`, Race, `Employment Status`)
```

Combine with the Qualtrics data: 

```{r}
dat4 |> 
  left_join(covariates, by = c("participantId" = "ParticipantId")) -> D
```



# Data exclusions

## Response time 

We will exclude respondents whose log response time is outside the interval $q_{25} - 1.5\times IQR, q_{75} + 1.5\times IQR$, where $q_{x}$ is the $x$th percentile of log response time and IQR is the interquartile range of log response times. We do this exclusion *by treatment arm*, given that the amount of text shown varies across treatment arms.

```{r}
D |> 
  group_by(Condition) |> 
  mutate(outlier = log(`Duration (in seconds)`) > quantile(log(`Duration (in seconds)`), .75) + 1.5*IQR(log(`Duration (in seconds)`)) | log(`Duration (in seconds)`) < quantile(log(`Duration (in seconds)`), .25) - 1.5*IQR(log(`Duration (in seconds)`))) -> D_with_outliers 

D_with_outliers |> 
  ggplot(aes(x = `Duration (in seconds)`, y = Condition, col = outlier)) + 
  geom_jitter() + 
  scale_x_log10()

D_with_outliers |> 
  filter(outlier & `Duration (in seconds)` < 70) |> 
  count(attention_check)
```

**It's interesting that these speeders mostly get the attention check correct! Still I think they could not have read the text.**

**I output the ids so that I could reject the speeders and approve the rest. (If you take too long I think it makes sense to exclude but I feel bad withholding the payment.)**

```{r}
D_with_outliers |> 
  ungroup() |> 
  filter(outlier & `Duration (in seconds)` < 70) |> 
  select(Participant = participantId) |> 
  mutate(`Message (Optional)` = "You took less than 70 seconds to complete the survey, which is not enough time to read the material you were asked to read (the median time was over 4 minutes). We therefore can't use your responses.") |> 
  write_csv("data/rejections_for_speeders.csv")

D_with_outliers |> 
  ungroup() |> 
  filter(!(outlier & `Duration (in seconds)` < 70)) |> 
  select(Participant = participantId) |> 
  mutate(`Message (Optional)` = "Thank you for participating in our survey") |> 
  write_csv("data/approvals.csv")
```

**This is the number of outliers by treatment condition:**

```{r}
D_with_outliers |> 
  count(outlier, .drop = F) |> 
  mutate(N = sum(n)) |> 
  filter(outlier) |> 
  select(-outlier) |> 
  rename(Outliers = n, Total = N)

D_with_outliers |> 
  filter(!outlier) |> 
  ungroup() -> D_without_outliers

```



## Attention checks 

We will exclude respondents who fail our attention check. The rationale is that our target population is Americans who are plausibly paying attention to the information being provided and attempting to answer honestly. We can of course include people who are not interested and are speeding through a survey (and we will include them if we see evidence that treatment affects the probability of failing the attention check), but we slightly favor to focus on the attentive subset of the population.

We first define a variable that indicates whether the respondent passed, and if they failed *how* they failed: 

```{r}
D_without_outliers |>
  mutate(attention_check_status = case_when(attention_check == "Strongly disagree,Neither agree nor disagree" ~ "Passed",
                                            !str_detect(attention_check, "\\,") ~ "Failed: single response",
                                            TRUE ~ "Failed: more than one response")) -> D_woo_ac_coded
```

**The proportion of attention check failures by type overall**:
```{r}
D_woo_ac_coded |> 
  count(attention_check_status) |> 
  mutate(prop = n/sum(n))
```


**The proportion of attention check failures by type and condition**:

```{r}
D_woo_ac_coded |> 
  group_by(Condition) |> 
  count(attention_check_status) |> 
  mutate(prop = n/sum(n))
```

**The proportions failing the attention check are very similar to those in the pilot, and to those in last year's survey on a different platform.**

We would like to know whether the treatment condition affects the probability of failing the attention check. We'll judge this by the p-value on the F-statistic from a regression of an indicator for failing the attention check on the treatment conditions:

```{r}
attention_check_reg <- estimatr::lm_robust(I(attention_check_status %in% c("Failed: single response", "Failed: more than one response")) ~ Condition, data = D_woo_ac_coded)

# a function to get the p-value from the F-test for a regression -- couldn't figure out how to get it from regression output
fstat_p_val_from_reg <- function(reg){
  fstat <- summary(reg)$fstatistic
  1 - pf(fstat[1], fstat[2], fstat[3])
}

summary(attention_check_reg)

fstat_bits <- summary(attention_check_reg)$fstatistic

(attention_check_p_val <- fstat_p_val_from_reg(attention_check_reg))
```

If this p-value is below .1, we have evidence that treatment affected attention and therefore that excluding inattentive respondents may cause the treatment groups to not be comparable. (This may also have happened even if the probability of failing is the same across treatment groups but selection into inattentiveness differs across treatment groups.) In that case we will not exclude respondents who fail the attention check. Otherwise, we will exclude them on the grounds that we are really interested in effects within the attentive population. 

Here we implement these exclusions: 

```{r}
if(attention_check_p_val < .1){
  D_woo_ac_coded -> D_post_exclusions
}else{
  D_woo_ac_coded |> 
    filter(attention_check_status == "Passed") -> D_post_exclusions
}

```

**Because this p-value was above .1, according to the PAP we proceed with excluding respondents who failed the attention check.**   


## Checking randomization and balance across treatment groups

Because we have excluded respondents who went very fast or very slow, and because we may have excluded respondents who failed an attention check, we will report on balance in pre-treatment covariates across treatment groups. If no serious concerns about balance arise, we will report only the p-values on the F-tests for the three regressions below and include the regression table in the appendix only if asked for it.

We will run regressions in which the outcome is an indicator for being assigned to a specific treatment group (e.g. "Both") restricting to those assigned to either that group or "Neither". The predictors are pre-treatment covariates chosen in the previous section. The test for balance is the F-test, which compares the fit of the model to the fit of a model with no predictors. 

If we reject the null at the .01 level in any of these three comparisons, we will run the same analysis for the pre-exclusion data, and if we are unable to reject the null at the .1 level in any of these comparisons we will focus on the pre-exclusion data in the paper. Otherwise we will assume that the imbalance we observe occurred by chance and will proceed with the analysis (using covariates in all cases as planned). 

```{r}
balance_reg_Emissions <- lm(I(Condition == "Emissions") ~ Affiliation + cc_concern_pre + attribution_quintile + self_efficacy_pre, data = D_post_exclusions |> filter(Condition %in% c("Neither", "Emissions"))) 

balance_reg_SAI <- lm(I(Condition == "SAI") ~ Affiliation + cc_concern_pre + attribution_quintile + self_efficacy_pre, data = D_post_exclusions |> filter(Condition %in% c("Neither", "SAI"))) 

balance_reg_Both <- lm(I(Condition == "Both") ~ Affiliation + cc_concern_pre + attribution_quintile + self_efficacy_pre, data = D_post_exclusions |> filter(Condition %in% c("Neither", "Both"))) 

# adding a row at the bottom with the p-values on the F-tests
rows <- tribble(~term, ~term, ~term, ~term,
                # see function for getting F-test p-value from regression output 
               "p-value from F-test", fstat_p_val_from_reg(balance_reg_Emissions), fstat_p_val_from_reg(balance_reg_SAI), fstat_p_val_from_reg(balance_reg_Both))
attr(rows, "position") <- 38 # put it at the bottom

modelsummary::modelsummary(list("Emissions" = balance_reg_Emissions, "SAI" = balance_reg_SAI, "Both" = balance_reg_Both), stars = T, gof_map = c("nobs", "r.squared", "adj.r.squared"), add_rows = rows) |> 
  tinytable::group_tt(j = list("DV: Treated" = 2:4))
```

**All three p-values are above .01 (the threshold we specified in the PAP), though one of them is below .05. This means that, following our PAP, our main analysis will exclude respondents who went too fast or slow as well as respondents who failed the attention check, because the balance across treatment groups is not bad enough to warrant including them. Ultimately, the appendix to the paper will include results without these exclusions, because the exclusions are based on post-treatment characteristics. But for the purposes of the class, just focus on the data with the exclusions.** 

**Incidentally, the p-value on the first regression is even lower without the exclusions, indicating that the exclusions are not the problem:**

```{r}
balance_reg_Emissions_no_exclusions <- lm(I(Condition == "Emissions") ~ Affiliation + cc_concern_pre + attribution_quintile + self_efficacy_pre, data = D |> filter(Condition %in% c("Neither", "Emissions"))) 
summary(balance_reg_Emissions_no_exclusions)
```


**Here we save the post-exclusion dataset and the pre-inclusion dataset just in case you want it.**

```{r}
save(D_post_exclusions, D, file = "data/D_and_D_post_exclusions_v2.RData")
```



